Licensed under the Creative Commons attribution-noncommercial license, http://creativecommons.org/licenses/by-nc/3.0/. Please share and remix noncommercially, mentioning its origin.
CC-BY_NC

Produced in R version 3.2.1 using pomp version 0.72.2.


Objectives

  1. To demonstrate the use of diagnostic probes for model criticism
  2. To teach some forecasting methods based on POMP models

These objectives will be achieved using a recent study (King et al. 2015), all codes for which are available on datadryad.org.

Model and data

The situation as of 1 October 2014

Let’s situate ourselves at the beginning of October 2014. The WHO situation report contained data on the number of cases in each of Guinea, Sierra Leone, and Liberia.

Download the data from the WHO Situation Report of 1 October 2014:

# baseurl <- "http://kinglab.eeb.lsa.umich.edu/SBIED/"
baseurl <- "../"
read.csv(paste0(baseurl,"data/ebola_data.csv"),stringsAsFactors=FALSE,
         colClasses=c(date="Date")) -> dat
sapply(dat,class)
##        week        date     country       cases 
##   "integer"      "Date" "character"   "numeric"
head(dat)
##   week       date country cases
## 1    1 2014-01-05  Guinea 2.244
## 2    2 2014-01-12  Guinea 2.244
## 3    3 2014-01-19  Guinea 0.073
## 4    4 2014-01-26  Guinea 5.717
## 5    5 2014-02-02  Guinea 3.954
## 6    6 2014-02-09  Guinea 5.444

Supplementing these data are population estimates for the three countries.

## Population sizes in Guinea, Liberia, and Sierra Leone (census 2014)
populations <- c(Guinea=10628972,Liberia=4092310,SierraLeone=6190280)
dat %>%
  ggplot(aes(x=date,y=cases,group=country,color=country))+
  geom_line()

An SEIR model with gamma-distributed latent and infectious periods

Many of the early modeling efforts used variants on the simple SEIR model. Here, we’ll focus on a variant that attempts a more accurate description of the durations of the latent and infectious periods. Specifically, this model assumes that the amount of time an individual spends in the latent period is \[\mathrm{LP} \sim {\mathrm{Gamma}\left(m,\frac{1}{m\,\alpha}\right)},\] where \(m\) is an integer. This means that the latent period has expectation \(1/\alpha\) and variance \(1/(m\,\alpha)\). Likewise, we assume that the infectious period \[\mathrm{IP} \sim {\mathrm{Gamma}\left(n,\frac{1}{n\,\gamma}\right)}.\] In this document, we’ll fix \(m=n=3\).

We implement Gamma distributions using the so-called linear chain trick.

Process model simulator

rSim <- Csnippet('
  double lambda, beta;
  double *E = &E1;
  beta = R0 * gamma; // Transmission rate
  lambda = beta * I / N; // Force of infection
  int i;

  // Transitions
  // From class S
  double transS = rbinom(S, 1.0 - exp(- lambda * dt)); // No of infections
  // From class E
  double transE[nstageE]; // No of transitions between classes E
  for(i = 0; i < nstageE; i++){
    transE[i] = rbinom(E[i], 1.0 - exp(-nstageE * alpha * dt));
  }
  // From class I
  double transI = rbinom(I, 1.0 - exp(-gamma * dt)); // No of transitions I->R

  // Balance the equations
  S -= transS;
  E[0] += transS - transE[0];
  for(i=1; i < nstageE; i++) {
    E[i] += transE[i-1] - transE[i];
  }
  I += transE[nstageE-1] - transI;
  R += transI;
  N_EI += transE[nstageE-1]; // No of transitions from E to I
  N_IR += transI; // No of transitions from I to R
')

Deterministic skeleton

The deterministic skeleton is an ODE.

skel <- Csnippet('
  double lambda, beta;
  const double *E = &E1;
  double *DE = &DE1;
  beta = R0 * gamma; // Transmission rate
  lambda = beta * I / N; // Force of infection
  int i;

  // Balance the equations
  DS = - lambda * S;
  DE[0] = lambda * S - nstageE * alpha * E[0];
  for (i=1; i < nstageE; i++)
    DE[i] = nstageE * alpha * (E[i-1]-E[i]);
  DI = nstageE * alpha * E[nstageE-1] - gamma * I;
  DR = gamma * I;
  DN_EI = nstageE * alpha * E[nstageE-1];
  DN_IR = gamma * I;
')

Measurement model: overdispersed count data

\(C_t | H_t\) is negative binomial with \({\mathbb{E}\left[{C_t|H_t}\right]} = \rho\,H_t\) and \({\mathrm{Var}\left[{C_t|H_t}\right]} = \rho\,H_t\,(1+k\,\rho\,H_t)\).

dObs <- Csnippet('
  double f;
  if (k > 0.0)
    f = dnbinom_mu(nearbyint(cases),1.0/k,rho*N_EI,1);
  else
    f = dpois(nearbyint(cases),rho*N_EI,1);
  lik = (give_log) ? f : exp(f);
')

rObs <- Csnippet('
  if (k > 0) {
    cases = rnbinom_mu(1.0/k,rho*N_EI);
  } else {
    cases = rpois(rho*N_EI);
  }')

Parameter transformations

toEst <- Csnippet('
  const double *IC = &S_0;
  double *TIC = &TS_0;
  TR0 = log(R0);
  Trho = logit(rho);
  Tk = log(k);
  to_log_barycentric(TIC,IC,4);
')

fromEst <- Csnippet('
  const double *IC = &S_0;
  double *TIC = &TS_0;
  TR0 = exp(R0);
  Trho = expit(rho);
  Tk = exp(k);
  from_log_barycentric(TIC,IC,4);
')

The following function constructs a pomp object to hold the data for any one of the countries.

ebolaModel <- function (country=c("Guinea", "SierraLeone", "Liberia"),
                        timestep = 0.1) {

  ctry <- match.arg(country)
  pop <- unname(populations[ctry])
 
  ## Incubation period is supposed to be Gamma distributed with shape parameter 3
  ## and mean 11.4 days.  The discrete-time formula is used to calculate the
  ## corresponding alpha (cf He et al., Interface 2010).
  ## Case-fatality ratio is fixed at 0.7 (cf WHO Ebola response team, NEJM 2014)
  incubation_period <- 11.4/7
  infectious_period <- 7/7
  index_case <- 10/pop
  dt <- timestep
  nstageE <- 3L

  globs <- paste0("static int nstageE = ",nstageE,";");

  theta <- c(N=pop,R0=1.4,
             alpha=-1/(nstageE*timestep)*log(1-nstageE*timestep/incubation_period),
             gamma=-log(1-timestep/infectious_period)/timestep,
             rho=0.2,cfr=0.7,
             k=0,
             S_0=1-index_case,E_0=index_case/2-5e-9,
             I_0=index_case/2-5e-9,R_0=1e-8)

  dat <- subset(dat,country==ctry,select=-country)

  ## Create the pomp object
  dat %>% 
    extract(c("week","cases")) %>%
    pomp(
      times="week",
      t0=min(dat$week)-1,
      params=theta,
      globals=globs,
      statenames=c("S","E1","I","R","N_EI","N_IR"),
      zeronames=c("N_EI","N_IR"),
      paramnames=c("N","R0","alpha","gamma","rho","k",
                   "S_0","E_0","I_0","R_0"),
      nstageE=nstageE,
      dmeasure=dObs, rmeasure=rObs,
      rprocess=discrete.time.sim(step.fun=rSim, delta.t=timestep),
      skeleton=skel, skeleton.type="vectorfield",
      toEstimationScale=toEst,
      fromEstimationScale=fromEst,
      initializer=function (params, t0, nstageE, ...) {
        all.state.names <- c("S",paste0("E",1:nstageE),"I","R","N_EI","N_IR")
        comp.names <- c("S",paste0("E",1:nstageE),"I","R")
        x0 <- setNames(numeric(length(all.state.names)),all.state.names)
        frac <- c(params["S_0"],rep(params["E_0"]/nstageE,nstageE),params["I_0"],params["R_0"])
        x0[comp.names] <- round(params["N"]*frac/sum(frac))
        x0
      }
    ) -> po
}

ebolaModel("Guinea") -> gin
ebolaModel("SierraLeone") -> sle
ebolaModel("Liberia") -> lbr

Parameter estimates

King et al. (2015) estimated parameters for this model for each country. A large Latin hypercube design was used to initiate a large number of iterated filtering runs. Profile likelihoods were computed for each country against the parameters \(k\) (the measurement model overdispersion) and \(R_0\) (the basic reproductive ratio). Full details are given on the datadryad.org site. The following loads the results of these calculations.

options(stringsAsFactors=FALSE)
profs <- read.csv(paste0(baseurl,"/ebola/ebola-profiles.csv"))

The following plots the profile likelihoods. The horizontal line represents the critical value of the likelihood ratio test for \(p=0.01\).

require(reshape2)
require(plyr)
require(magrittr)
require(ggplot2)
theme_set(theme_bw())

profs %>% 
  melt(id=c("profile","country","loglik")) %>%
  subset(variable==profile) %>%
  ddply(~country,mutate,dll=loglik-max(loglik)) %>%
  ddply(~country+profile+value,subset,loglik==max(loglik)) %>% 
  ggplot(mapping=aes(x=value,y=dll))+
  geom_point(color='red')+
  geom_hline(yintercept=-0.5*qchisq(p=0.99,df=1))+
  facet_grid(country~profile,scales='free')+
  labs(y=expression(l))

Diagnostics

Parameter estimation is the process of finding the parameters that are “best”, in some sense, for a given model, from among the set of those that make sense for that model. Model selection, likewise, aims at identifying the “best” model, in some sense, from among a set of candidates. One can do both of these things more or less well, but no matter how carefully they are done, the best of a bad set of models is still bad.

Lets’ investigate the model here, at its maximum-likelihood parameters, to see if we can identify problems. The guiding principle in this is that, if the model is “good”, then the data are a plausible realization of that model. Therefore, we can compare the data directly against model simulations. Moreover, we can quantify the agreement between simulations and data in any way we like. Any statistic, or set of statistics, that can be applied to the data can also be applied to simulations. Shortcomings of the model should manifest themselves as discrepancies between the model-predicted distribution of such statistics and their value on the data.

pomp provides tools to facilitate this process. Specifically, the probe function applies a set of user-specified probes or summary statistics, to the model and the data, and quantifies the degree of disagreement in several ways.

Let’s see how this is done using the model for the Liberian outbreak.

library(pomp)
library(plyr)
library(reshape2)
library(magrittr)
options(stringsAsFactors=FALSE)

require(foreach)
require(doMC)

profs %>%
  subset(country=="Liberia") %>%
  subset(loglik==max(loglik),
         select=-c(loglik,loglik.se,country,profile)) %>%
  unlist() -> coef(lbr)

simulate(lbr,nsim=20,as.data.frame=TRUE,include.data=TRUE) %>% 
  mutate(date=min(dat$date)+7*(time-1)) %>% 
  ggplot(aes(x=date,y=cases,group=sim,color=(sim=="data")),
         alpha=(sim=="data"))+
  geom_line()+
  guides(color=FALSE,alpha=FALSE)+
  scale_color_manual(values=c(`TRUE`='red',`FALSE`=gray(0.8)))+
  scale_alpha_manual(values=c(`TRUE`=1,`FALSE`=0.4))

The simulations appear to be growing a bit more quickly than the data. Let’s try to quantify this. First, we’ll write a function that estimates the exponential growth rate by linear regression. Then, we’ll apply it to the data and to 500 simulations.

growth.rate <- function (y) {
  cases <- y["cases",]
  fit <- lm(log1p(cases)~seq_along(cases))
  unname(coef(fit)[2])
}
probe(lbr,probes=list(r=growth.rate),nsim=500) %>% plot()

The simulations also appear to be more highly variable around the trend than do the data.

growth.rate.plus <- function (y) {
  cases <- y["cases",]
  fit <- lm(log1p(cases)~seq_along(cases))
  c(r=unname(coef(fit)[2]),sd=sd(residuals(fit)))
}
probe(lbr,probes=list(growth.rate.plus),
      nsim=500) %>% plot()

We can look at a whole suite of probes simultaneously.

log.detrend <- function (y) {
  cases <- y["cases",]
  as.numeric(residuals(lm(log1p(cases)~seq_along(cases))))
} 

probe(lbr,probes=list(
  growth.rate.plus,
  probe.quantile(var="cases",prob=c(0.1,0.5,0.9)),
  probe.acf(var="cases",lags=c(1,2,3),type="correlation")
            # transform=log.detrend)
  ),
  nsim=500) %>% plot()

Exercise: the SEIR model for the Sierra Leone outbreak

Forecasting

Below are the contents of the file forecasts.R, which performs all the forecasting computations. In a directory containing ebola-model.R, ebola-profiles.rds, and hosts, a command like

mpirun -hostfile hosts -np 101 Rscript --vanilla forecasts.R

will result in the execution of these computations.

Contents of the file forecasts.R:

require(pomp)
require(plyr)
require(reshape2)
require(magrittr)
options(stringsAsFactors=FALSE)

set.seed(988077383L)

require(foreach)
require(doMPI)
require(iterators)

source("ebola-model.R")

horizon <- 13

foreach (country=c("SierraLeone"),.inorder=TRUE,.combine=c) %:%
  foreach (type=c("raw","cum"),.inorder=TRUE,.combine=c) %do%
  {
    M1 <- ebolaModel(country=country,type=type,
                     timestep=0.01,nstageE=3,na.rm=TRUE)
    M2 <- ebolaModel(country=country,type="raw",
                     timestep=0.01,nstageE=3,na.rm=TRUE)
    time(M2) <- seq(from=1,to=max(time(M1))+horizon,by=1)
    M3 <- ebolaModel(country=country,type="raw",
                     timestep=0.01,nstageE=3,na.rm=TRUE)
    time(M3) <- seq(from=max(time(M1))+1,to=max(time(M1))+horizon,by=1)
    timezero(M3) <- max(time(M1))
    list(M1,M2,M3)
    } -> models
dim(models) <- c(3,2,1)
dimnames(models) <- list(c("fit","det.forecast","stoch.forecast"),
                         c("raw","cum"),c("SierraLeone"))

noexport <- c("models")

## Weighted quantile function
wquant <- function (x, weights, probs = c(0.025,0.5,0.975)) {
  idx <- order(x)
  x <- x[idx]
  weights <- weights[idx]
  w <- cumsum(weights)/sum(weights)
  rval <- approx(w,x,probs,rule=1)
  rval$y
  }

starts <- c(Guinea="2014-01-05",Liberia="2014-06-01",SierraLeone="2014-06-08")

cl <- startMPIcluster()
registerDoMPI(cl)

bake <- function (file, expr) {
  if (file.exists(file)) {
    readRDS(file)
    } else {
      val <- eval(expr)
      saveRDS(val,file=file)
      val
      }
  }

readRDS("profiles.rds") %>%
  ddply(~country+type+model,subset,loglik>max(loglik)-6,
        select=-c(conv,etime,loglik.se,nfail.min,nfail.max,profile)) -> mles

mles %>% melt(id=c("country","type","model"),variable.name='parameter') %>%
  ddply(~country+type+model+parameter,summarize,
        min=min(value),max=max(value)) %>%
  subset(parameter!="loglik") %>%
  melt(measure=c("min","max")) %>%
  acast(country~type~model~parameter~variable) -> ranges

mles %>% ddply(~country+type+model,subset,loglik==max(loglik),select=-loglik) %>%
  mutate(k=round(k,4),rho=round(rho,4),R0=round(R0,4),E_0=3*round(E_0/3)) %>%
  unique() %>%
  arrange(country,type,model) -> mles

### DETERMINISTIC MODELS

bake(file="ebola-forecasts_det.rds",{
  foreach (country=c("SierraLeone"),
           .inorder=TRUE,.combine=rbind) %:%
    foreach (type=c("raw","cum"),nsamp=c(1000,3000),
             .inorder=TRUE,.combine=rbind) %do%
    {
      
      params <- sobolDesign(lower=ranges[country,type,'det',,'min'],
                            upper=ranges[country,type,'det',,'max'],
                            nseq=nsamp)
      
      foreach(p=iter(params,by='row'),
              .inorder=FALSE,
              .combine=rbind,
              .noexport=noexport,
              .options.multicore=list(set.seed=TRUE),
              .options.mpi=list(chunkSize=10,seed=1568335316L,info=TRUE)
              ) %dopar%
        {
          M1 <- models["fit",type,country][[1]]
          M2 <- models["det.forecast",type,country][[1]]
          ll <- logLik(traj.match(M1,start=unlist(p)))
          x <- trajectory(M2,params=unlist(p))
          p <- parmat(unlist(p),20)
          rmeasure(M2,x=x,times=time(M2),params=p) %>%
            melt() %>%
            mutate(time=time(M2)[time],
                   period=ifelse(time<=max(time(M1)),"calibration","projection"),
                   loglik=ll)
        } %>% 
        subset(variable=="cases",select=-variable) %>%
        mutate(weight=exp(loglik-mean(loglik))) %>%
        arrange(time,rep) -> sims
      
      ess <- with(subset(sims,time==max(time)),weight/sum(weight))
      ess <- 1/sum(ess^2)
      cat("ESS det",country,type,"=",ess,"\n")
      
      sims %>%
        ddply(~time+period,summarize,prob=c(0.025,0.5,0.975),
              quantile=wquant(value,weights=weight,probs=prob)) %>%
        mutate(prob=mapvalues(prob,from=c(0.025,0.5,0.975),
                              to=c("lower","median","upper"))) %>%
        dcast(period+time~prob,value.var='quantile') %>%
        mutate(country=country,type=type)
      }
  }) -> fc_tm

### STOCHASTIC MODEL

bake(file="ebola-forecasts_stoch.rds",{
  foreach (country=c("SierraLeone"),
           .inorder=TRUE,.combine=rbind) %:%
    foreach (type=c("raw","cum"),nsamp=c(200,200),
             .inorder=TRUE,.combine=rbind) %do%
    {
      
      params <- sobolDesign(lower=ranges[country,type,'stoch',,'min'],
                            upper=ranges[country,type,'stoch',,'max'],
                            nseq=nsamp)
      
      foreach(p=iter(params,by='row'),
              .inorder=FALSE,
              .combine=rbind,
              .noexport=noexport,
              .options.multicore=list(set.seed=TRUE),
              .options.mpi=list(chunkSize=1,seed=1568335316L,info=TRUE)
              ) %dopar%
        {
          M1 <- models["fit",type,country][[1]]
          M2 <- models["stoch.forecast",type,country][[1]]
          pf <- pfilter(M1,params=unlist(p),Np=2000,save.states=TRUE)
          pf$saved.states %>% tail(1) %>% melt() %>%
            acast(variable~rep,value.var='value') %>%
            apply(2,function (x) {
              setNames(c(x["S"],sum(x[c("E1","E2","E3")]),x["I"],x["R"]),
                       c("S_0","E_0","I_0","R_0"))}) -> x
          pp <- parmat(unlist(p),ncol(x))
          pp[rownames(x),] <- x
          simulate(M2,params=pp,obs=TRUE) %>%
            melt() %>%
            mutate(time=time(M2)[time],
                   period=ifelse(time<=max(time(M1)),"calibration","projection"),
                   loglik=logLik(pf))
        } %>% subset(variable=="cases",select=-variable) %>%
        mutate(weight=exp(loglik-mean(loglik))) %>%
        arrange(time,rep) -> sims
      
      ess <- with(subset(sims,time==max(time)),weight/sum(weight))
      ess <- 1/sum(ess^2)
      cat("ESS stoch",country,type,"=",ess,"\n")
      
      sims %>% ddply(~time+period,summarize,prob=c(0.025,0.5,0.975),
                     quantile=wquant(value,weights=weight,probs=prob)) %>%
        mutate(prob=mapvalues(prob,from=c(0.025,0.5,0.975),
                              to=c("lower","median","upper"))) %>%
        dcast(period+time~prob,value.var='quantile') %>%
        mutate(country=country,type=type)
      }
  }) -> fc_if

ldply(list(stoch=fc_if,det=fc_tm),.id='model') %>%
  ddply(~country,mutate,
        model=factor(model,levels=c("stoch","det")),
        date=as.Date(starts[unique(as.character(country))])+7*(time-1)) %>%
  saveRDS(file='forecasts.rds')

closeCluster(cl)
mpi.quit()

References

King, A. A., M. Domenech de Cellès, F. M. G. Magpantay, and P. Rohani. 2015. Avoidable errors in the modelling of outbreaks of emerging pathogens, with special reference to ebola. Proceedings of the Royal Society of London. Series B 282:20150347.